1 Libraries

This workflow reanalyze the single nucleus RNA-seq data produced by (Habib et al. 2017), using DroNc-seq: massively parallel sNuc-seq with droplet technology.

We will first load a few libraries. Among them, * DropletUtils provides functions for data from droplet technologies such as 10X Genomics. * biomaRt provides easy access to databases, such as Ensembl, COSMIC, Uniprot, HGNC, etc. * scater is a collection of tools for doing quality control analyses of scRNA-seq * scran provide functions for normalization of cell-specific libary sizes, correcting batch effects, and identification marker genes

bio_packs = c("SingleCellExperiment","DropletUtils","biomaRt",
              "scater","scran","limma","org.Hs.eg.db","SC3")
bio_packs = bio_packs[! bio_packs %in% installed.packages()[,"Package"]]

if( length(bio_packs) > 0 ){
  source("https://bioconductor.org/biocLite.R")
  biocLite(bio_packs, suppressUpdates = TRUE)
}

cran_packs = c("stringi","irlba")
cran_packs = cran_packs[! cran_packs %in% installed.packages()[,"Package"]]

if( length(cran_packs) > 0 ){
  install.packages(cran_packs)
}

library(data.table)
library(SingleCellExperiment)
library(DropletUtils)
library(biomaRt)
library(scater)
library(scran)
library(limma)
library(ggplot2)

src_dir   = "~/research/GitHub/scRNAseq_pipelines/dronc"
data_dir  = "~/research/scRNAseq/workflow/data"
dronc_dir = "~/research/scRNAseq/data/GTEx_droncseq_hip_pcf"

2 Obtaining/Loading Counts

Next we read in the count data. The dataset is available here.

fname  = file.path(dronc_dir, "GTEx_droncseq_hip_pcf.umi_counts.txt.gz")
counts = fread(paste('zcat <', fname))
dim(counts)
## [1] 32111 14964
counts[1:3,1:2]
##          V1 hHP1_AACACTATCTAC
## 1:     A1BG                 0
## 2: A1BG-AS1                 0
## 3:     A1CF                 0
counts.mx = as.matrix(counts[,-1])
rownames(counts.mx) = counts$V1
sce = SingleCellExperiment(assays = list(counts = counts.mx))
sce
## class: SingleCellExperiment 
## dim: 32111 14963 
## metadata(0):
## assays(1): counts
## rownames(32111): A1BG A1BG-AS1 ... ZZEF1 ZZZ3
## rowData names(0):
## colnames(14963): hHP1_AACACTATCTAC hHP1_CTACGCATCCAT ...
##   PFC-CD_CTCCATTCATGC PFC-CD_CGTCATTAGCAG
## colData names(0):
## reducedDimNames(0):
## spikeNames(0):

3 Pre-processing and Quality Control

3.1 Gene Annotation

We will extract annotation information based on gene names

anno.file = file.path(data_dir, "gene.annoation_dronc.rds")
if(file.exists(anno.file)){
  gene_anno = readRDS(anno.file)
}else{

  ensembl = useEnsembl(biomart="ensembl",GRCh=37,
                       dataset="hsapiens_gene_ensembl")
  
  attr_string = c("hgnc_symbol","external_gene_name","chromosome_name",
                  "start_position","end_position","strand","description",
                  "percentage_gene_gc_content","gene_biotype")
  
  gene_anno = getBM(attributes = attr_string,
                    filters = "external_gene_name",
                    values = rownames(sce),
                    mart = ensembl)
  saveRDS(gene_anno, file=anno.file)
}

dim(sce); dim(gene_anno)
## [1] 32111 14963
## [1] 34526     9
w2rm = which(!gene_anno$external_gene_name %in% rownames(sce))
w2rm
## [1] 4232 4645
gene_anno[w2rm,]
##      hgnc_symbol external_gene_name chromosome_name start_position
## 4232                       C10ORF68              10       32832297
## 4645                       C1ORF220               1      178511950
##      end_position strand
## 4232     33171802      1
## 4645    178517579      1
##                                                                                                               description
## 4232 Homo sapiens coiled-coil domain containing 7 (CCDC7), transcript variant 5, mRNA. [Source:RefSeq mRNA;Acc:NM_024688]
## 4645                                                                                                                     
##      percentage_gene_gc_content   gene_biotype
## 4232                      37.23 protein_coding
## 4645                      49.91 protein_coding
gene_anno = gene_anno[-w2rm,]
dim(sce); dim(gene_anno)
## [1] 32111 14963
## [1] 34524     9

Many genes have multiple entries in annotation, often because they are annotatd to scaffolds, assembly patches and alternate loci. Here we simply remove such entries. The we remove duplicated annotations and genes without annotations.

table(gene_anno$chromosome_name)[1:30]
## 
##          1         10         11         12         13         14 
##       3084       1252       1741       1694        637       1145 
##         15         16         17         18         19          2 
##       1169       1426       1820        633       1955       2186 
##         20         21         22          3          4          5 
##        800        383        704       1784       1301       1567 
##          6          7          8          9 GL000192.1 GL000193.1 
##       1622       1446       1296       1207          2          1 
## GL000194.1 GL000195.1 GL000199.1 GL000204.1 GL000205.1 GL000213.1 
##          2          1          1          1          2          1
chr.nms = c(1:22,"X","Y","MT")
gene_anno = gene_anno[which(gene_anno$chromosome_name %in% chr.nms),]
dim(sce); dim(gene_anno)
## [1] 32111 14963
## [1] 31978     9
t1 = table(gene_anno$external_gene_name)
t2 = sort(t1[t1 > 1], decreasing=TRUE)
length(t2)
## [1] 40
t2[1:10]
## 
##     SNORD113    MIR1302-2       NPIPA7      CCDC177        CDRT1 
##            6            4            3            2            2 
##    CEBPA-AS1      FAM226B FAM47E-STBD1         GATS      GOLGA7B 
##            2            2            2            2            2
gene_anno[which(gene_anno$external_gene_name %in% names(t2)[1:4]), 1:4]
##       hgnc_symbol external_gene_name chromosome_name start_position
## 5013      CCDC177            CCDC177              14       70037483
## 5014                         CCDC177              14       70038216
## 14832  MIR1302-11          MIR1302-2              19          71161
## 14833  MIR1302-10          MIR1302-2              19          71161
## 14834   MIR1302-9          MIR1302-2              19          71161
## 14835   MIR1302-2          MIR1302-2              19          71161
## 16502                         NPIPA7              16       16411301
## 16503      NPIPA7             NPIPA7              16       16472912
## 16505                         NPIPA7              16       18451943
## 29789                       SNORD113              14      101422577
## 29810                       SNORD113              14      101443726
## 29811                       SNORD113              14      101445339
## 29812                       SNORD113              14      101446329
## 29825                       SNORD113              14      101460594
## 29826                       SNORD113              14      101464804
w.duplicate = which(gene_anno$external_gene_name %in%  names(t2))
ganno2 = gene_anno[w.duplicate,]
dim(ganno2)
## [1] 87  9
table(ganno2$hgnc_symbol == ganno2$external_gene_name)
## 
## FALSE  TRUE 
##    46    41
ganno2 = ganno2[which(ganno2$hgnc_symbol == ganno2$external_gene_name),]
dim(ganno2)
## [1] 41  9
ganno2 = dplyr::distinct(ganno2,external_gene_name,.keep_all = TRUE)
dim(ganno2)
## [1] 39  9
gene_anno = rbind(gene_anno[-w.duplicate,], ganno2)
dim(gene_anno)
## [1] 31930     9
table(gene_anno$gene_biotype)
## 
## 3prime_overlapping_ncrna                antisense                IG_C_gene 
##                       11                     4152                        3 
##                IG_V_gene                  lincRNA                    miRNA 
##                        6                     4807                      966 
##                 misc_RNA                  Mt_rRNA                  Mt_tRNA 
##                      935                        2                       18 
##     processed_transcript           protein_coding               pseudogene 
##                      394                    18296                        3 
##                     rRNA           sense_intronic        sense_overlapping 
##                      204                      648                      172 
##                   snoRNA                    snRNA                TR_C_gene 
##                      201                     1083                        5 
##                TR_J_gene                TR_V_gene 
##                        7                       17
gene_missing = setdiff(rownames(sce),gene_anno$external_gene_name)
gene_missing[1:10]
##  [1] "AC005152.2" "AC006132.1" "AC006445.8" "AC007092.1" "AC007390.5"
##  [6] "AC007464.1" "AC009232.2" "AC009236.1" "AC010127.5" "AC011043.1"
length(gene_missing)
## [1] 181
w2kp = match(gene_anno$external_gene_name,rownames(sce))
table(is.na(w2kp))
## 
## FALSE 
## 31930
gene_anno$external_gene_name[which(is.na(w2kp))]
## character(0)
sce = sce[w2kp,]
dim(sce)
## [1] 31930 14963
rowData(sce) = gene_anno

3.2 Identify Low quality cells

3.2.1 barcodeRanks filtering

Please refer to this workflow in bioconductor for reference.

# Calling cells from empty droplets
bcrank = barcodeRanks(counts(sce))

# Only show unique points for plotting speed.
uniq = !duplicated(bcrank$rank)

par(mar=c(5,4,2,1), bty="n")
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy", 
     xlab="Rank", ylab="Total UMI count", cex=0.5, cex.lab=1.2)

abline(h=bcrank$inflection, col="darkgreen", lty=2,lwd=2)
abline(h=bcrank$knee, col="dodgerblue", lty=2,lwd=2)

legend("left", legend=c("Inflection", "Knee"), bty="n", 
       col=c("darkgreen", "dodgerblue"), lty=2, cex=1.2,lwd=2)

bcrank$inflection
## hCf_TAATAGCGGTAC 
##              221
bcrank$knee
## PFC2-A2_CACAGCGCTGTC 
##                  580
summary(bcrank$total)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   218.0   459.0   716.0   948.9  1167.0  9423.0
table(bcrank$total >= bcrank$knee)
## 
## FALSE  TRUE 
##  5513  9450
table(bcrank$total >= bcrank$inflection)
## 
## FALSE  TRUE 
##     1 14962
set.seed(100)
e.out = emptyDrops(counts(sce))
e.out
## DataFrame with 14963 rows and 5 columns
##                         Total   LogProb    PValue   Limited       FDR
##                     <integer> <numeric> <numeric> <logical> <numeric>
## hHP1_AACACTATCTAC        4484       NaN         1     FALSE         0
## hHP1_CTACGCATCCAT        4919       NaN         1     FALSE         0
## hHP1_TCGGTACTAATA        5163       NaN         1     FALSE         0
## hHP1_CCCGCACGCTAT        5030       NaN         1     FALSE         0
## hHP1_TCATTTTGTCAT        3588       NaN         1     FALSE         0
## ...                       ...       ...       ...       ...       ...
## PFC-CD_TTGCCTGGCGGG       590       NaN         1     FALSE         0
## PFC-CD_CACGCTCCCCTA       269       NaN         1     FALSE         1
## PFC-CD_GCTCTACAACCG       522       NaN         1     FALSE         1
## PFC-CD_CTCCATTCATGC       497       NaN         1     FALSE         1
## PFC-CD_CGTCATTAGCAG       568       NaN         1     FALSE         1
length(unique(e.out$FDR))
## [1] 2
table(e.out$FDR)
## 
##    0    1 
## 9450 5513
tapply(e.out$Total, e.out$FDR, summary)
## $`0`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     581     748    1011    1268    1487    9423 
## 
## $`1`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   218.0   317.0   391.0   402.3   489.0   580.0

From the above analysis, some cells with very small number of UMI’s. Here we chose do not remove any cells because it appears all these 14,963 cells were used in the main anlaysis. Based on Figure 2 of their paper, there are

14,963 DroNc-seq nuclei profiles (each with >10,000 reads and >200 genes)

3.2.2 Incorporate information of mitochondira/ribosomal genes in QC metrics

We will generate a set of QC features per cell, including the expression of mitochondira/ribosomal genes. We identify ribosomal genes based on annoation from https://www.genenames.org/.

file_link = "https://www.genenames.org/cgi-bin/genefamilies/set/1054/download/branch"
file_name = "ribosome_genes.txt"
ribo_fn   = file.path(data_dir, file_name)

if( !file.exists(ribo_fn) ){
  cmd = sprintf("cd %s; wget %s",data_dir, file_link)
  print(cmd)
  system(cmd)
}

ribo = read.table(ribo_fn, sep='\t', header=TRUE, stringsAsFactors = FALSE)
ribo[1:2,]
##   HGNC.ID Approved.Symbol          Approved.Name   Status Previous.Symbols
## 1   10298           RPL10  ribosomal protein L10 Approved                 
## 2   10299          RPL10A ribosomal protein L10a Approved            NEDD6
##                                  Synonyms Chromosome Accession.Numbers
## 1 NOV, QM, DXS648E, DXS648, FLJ23544, L10       Xq28          AB007170
## 2                            Csa-19, L10A    6p21.31            U12404
##   RefSeq.IDs Gene.Family.Tag Gene.family.description Gene.family.ID
## 1  NM_006013             RPL    L ribosomal proteins            729
## 2  NM_007104             RPL    L ribosomal proteins            729
is_mito = which(rowData(sce)$chromosome_name == "MT")
is_ribo = which(rowData(sce)$hgnc_symbol %in% ribo$Approved.Symbol)
length(is_mito)
## [1] 33
length(is_ribo)
## [1] 160
sce = calculateQCMetrics(sce, feature_controls=list(Mt=is_mito, Ri=is_ribo))
sort(colnames(colData(sce)))
##  [1] "is_cell_control"                               
##  [2] "log10_total_counts"                            
##  [3] "log10_total_counts_endogenous"                 
##  [4] "log10_total_counts_feature_control"            
##  [5] "log10_total_counts_Mt"                         
##  [6] "log10_total_counts_Ri"                         
##  [7] "log10_total_features"                          
##  [8] "log10_total_features_by_counts"                
##  [9] "log10_total_features_by_counts_endogenous"     
## [10] "log10_total_features_by_counts_feature_control"
## [11] "log10_total_features_by_counts_Mt"             
## [12] "log10_total_features_by_counts_Ri"             
## [13] "log10_total_features_endogenous"               
## [14] "log10_total_features_feature_control"          
## [15] "log10_total_features_Mt"                       
## [16] "log10_total_features_Ri"                       
## [17] "pct_counts_endogenous"                         
## [18] "pct_counts_feature_control"                    
## [19] "pct_counts_in_top_100_features"                
## [20] "pct_counts_in_top_100_features_endogenous"     
## [21] "pct_counts_in_top_100_features_feature_control"
## [22] "pct_counts_in_top_100_features_Ri"             
## [23] "pct_counts_in_top_200_features"                
## [24] "pct_counts_in_top_200_features_endogenous"     
## [25] "pct_counts_in_top_50_features"                 
## [26] "pct_counts_in_top_50_features_endogenous"      
## [27] "pct_counts_in_top_50_features_feature_control" 
## [28] "pct_counts_in_top_50_features_Ri"              
## [29] "pct_counts_in_top_500_features"                
## [30] "pct_counts_in_top_500_features_endogenous"     
## [31] "pct_counts_Mt"                                 
## [32] "pct_counts_Ri"                                 
## [33] "pct_counts_top_100_features"                   
## [34] "pct_counts_top_100_features_endogenous"        
## [35] "pct_counts_top_100_features_feature_control"   
## [36] "pct_counts_top_100_features_Ri"                
## [37] "pct_counts_top_200_features"                   
## [38] "pct_counts_top_200_features_endogenous"        
## [39] "pct_counts_top_50_features"                    
## [40] "pct_counts_top_50_features_endogenous"         
## [41] "pct_counts_top_50_features_feature_control"    
## [42] "pct_counts_top_50_features_Ri"                 
## [43] "pct_counts_top_500_features"                   
## [44] "pct_counts_top_500_features_endogenous"        
## [45] "total_counts"                                  
## [46] "total_counts_endogenous"                       
## [47] "total_counts_feature_control"                  
## [48] "total_counts_Mt"                               
## [49] "total_counts_Ri"                               
## [50] "total_features"                                
## [51] "total_features_by_counts"                      
## [52] "total_features_by_counts_endogenous"           
## [53] "total_features_by_counts_feature_control"      
## [54] "total_features_by_counts_Mt"                   
## [55] "total_features_by_counts_Ri"                   
## [56] "total_features_endogenous"                     
## [57] "total_features_feature_control"                
## [58] "total_features_Mt"                             
## [59] "total_features_Ri"
par(mfrow=c(2,2), mar=c(5, 4, 1, 1), bty="n")
hist(log10(sce$total_counts), xlab="log10(Library sizes)", main="", 
    breaks=20, col="grey80", ylab="Number of cells")

hist(log10(sce$total_features), xlab="log10(# of expressed genes)", 
     main="", breaks=20, col="grey80", ylab="Number of cells")

hist(sce$pct_counts_Ri, xlab="Ribosome prop. (%)",
    ylab="Number of cells", breaks=40, main="", col="grey80")

hist(sce$pct_counts_Mt, xlab="Mitochondrial prop. (%)", 
    ylab="Number of cells", breaks=80, main="", col="grey80")

par(mfrow=c(2,2), mar=c(5, 4, 1, 1), bty="n")
plot(log10(sce$total_counts), log10(sce$total_features), 
     xlab="log10(Library sizes)", ylab="log10(# of expressed genes)", 
     pch=16, col=rgb(0,0,0,0.4))
plot(log10(sce$total_counts), sce$pct_counts_Ri, pch=16, col=rgb(0,0,0,0.4), 
     xlab="log10(Library sizes)", ylab="Ribosome prop. (%)")

plot(log10(sce$total_counts), sce$pct_counts_Mt, pch=16, col=rgb(0,0,0,0.4), 
     xlab="log10(Library sizes)", ylab="Mitochondrial prop. (%)")

plot(x=sce$pct_counts_Ri, y=sce$pct_counts_Mt, pch=16, col=rgb(0,0,0,0.4), 
     xlab="Ribosome prop. (%)", ylab="Mitochondrial prop. (%)")

3.3 Summarize gene level information

rowData(sce)[1:2,]
## DataFrame with 2 rows and 20 columns
##   hgnc_symbol external_gene_name chromosome_name start_position
##   <character>        <character>     <character>      <integer>
## 1                    AC006946.16              22       17548711
## 2                    AC006946.17              22       17561591
##   end_position    strand description percentage_gene_gc_content
##      <integer> <integer> <character>                  <numeric>
## 1     17551565         1                                  41.61
## 2     17562346         1                                  41.67
##   gene_biotype is_feature_control is_feature_control_Mt
##    <character>          <logical>             <logical>
## 1      lincRNA              FALSE                 FALSE
## 2      lincRNA              FALSE                 FALSE
##   is_feature_control_Ri         mean_counts   log10_mean_counts
##               <logical>           <numeric>           <numeric>
## 1                 FALSE 0.00548018445498897  0.0023735161394708
## 2                 FALSE 0.00360890195816347 0.00156450482886486
##   n_cells_by_counts pct_dropout_by_counts total_counts log10_total_counts
##           <integer>             <numeric>    <integer>          <numeric>
## 1                79      99.4720310098242           82   1.91907809237607
## 2                52      99.6524761077324           54   1.74036268949424
##   n_cells_counts pct_dropout_counts
##        <integer>          <numeric>
## 1             79   99.4720310098242
## 2             52   99.6524761077324
min(rowData(sce)$mean_counts)
## [1] 6.683152e-05
min(rowData(sce)$mean_counts[rowData(sce)$mean_counts>0])
## [1] 6.683152e-05
min(rowData(sce)$n_cells_counts)
## [1] 1
par(mfrow=c(1,3), mar=c(5,4,1,1))
hist(log10(rowData(sce)$mean_counts+1e-6), col="grey80",  main="", 
     breaks=40, xlab="log10(ave # of UMI + 1e-6)")
hist(log10(rowData(sce)$n_cells_counts+1), col="grey80", main="", 
     breaks=40, xlab="log10(# of expressed cells + 1)")
plot(log10(rowData(sce)$mean_counts+1e-6), pch=16, col=rgb(0,0,0,0.4), 
              log10(rowData(sce)$n_cells_counts + 1), 
              xlab="log10(ave # of UMI + 1e-6)", 
              ylab="log10(# of expressed cells + 1)")

tb1 = table(rowData(sce)$n_cells_counts)
tb1[1:11]
## 
##    1    2    3    4    5    6    7    8    9   10   11 
## 3302 1958 1280 1029  843  667  552  525  473  383  369

We remove those genes that are lowly expressed. (Habib et al. 2017) mentioned in section “Gene detection and quality controls”" of Online Methods

Nuclei with less than 200 detected genes and less than 10,000 usable reads were filtered out.

and

A gene is considered detected in a cell if it has at least two unique UMIs (transcripts) associated with it. For each analysis, genes were removed that were detected in less than 10 nuclei.

Therefore it seems we should remove all the cells having less than 200 genes with two or more UMI counts. However, this would remove majorify of the cells. Therefore, we conlcude that they meant to remove the cells having less than 200 genes with one or more UMI counts. To filter genes, we follow their threshold to remove genes with two or more UMIs in less than 10 nuclei.

Note taht the variable strand need to be renamed, otherwise there is an error message that such a variabel name cannot be used.

names(rowData(sce))[6] = "strand_n"

n.genes = colSums(counts(sce) >= 2)
summary(n.genes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     6.0    42.0    71.0   112.1   132.0  1622.0
table(n.genes >= 100)
## 
## FALSE  TRUE 
##  9635  5328
table(n.genes >= 200)
## 
## FALSE  TRUE 
## 12902  2061
n.genes = colSums(counts(sce) >= 1)
summary(n.genes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   198.0   336.0   528.0   656.8   824.0  4233.0
table(n.genes >= 100)
## 
##  TRUE 
## 14963
table(n.genes >= 200)
## 
## FALSE  TRUE 
##     3 14960
n.cells = rowSums(counts(sce) >= 2)
summary(n.cells)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.00     0.00     2.00    52.53    26.00 13369.00
table(n.cells >= 10)
## 
## FALSE  TRUE 
## 20649 11281
sce = sce[which(n.cells >= 10),]
dim(sce)
## [1] 11281 14963

Next we check those highly expressed genes

par(mar=c(5,4,1,1))
od1 = order(rowData(sce)$mean_counts, decreasing = TRUE)
barplot(rowData(sce)$mean_counts[od1[20:1]], las=1, 
        names.arg=rowData(sce)$hgnc_symbol[od1[20:1]], 
        horiz=TRUE, cex.names=0.8, cex.axis=0.8, 
        xlab="ave # of UMI")

3.4 Normalization

A simple solution for normalization and stablizing expression varaince across genes is to tranform the count data by log(count/size.factor + 1). One may calcualte size.factor per cell as the total number of UMIs, and this assumes the total expression are the same across all the cells. However, the total expression of each cell may vary with respect to cell type and/or cell size, and the computeSumFactors function in R package scran provides a more sophisicated way to calcualte size.factor to allow such variaation across cells (Lun, Bach, and Marioni 2016). computeSumFactors can use initial clustering of cells to normalize expression within and beetween clusters. Within a cluster, it estimates the size factor for many groups of cells so that there are more groups than cells, and then it can calcualte the size factor per cell using a lienar deconvolution system.

As shown in the following plot, the final size factor estimation is indeed highly correlated with the naive definition by total count.

Finally, the command normalize(sce) adds the normalized expression into the variable sce, which can be accessed by logcounts(sce) = log2(gene_cell_count / size_factor + 1).

date()
## [1] "Thu Oct  4 23:16:57 2018"
clusters = quickCluster(sce, min.mean=0.1, method="igraph")
table(clusters)
## clusters
##    1    2    3    4    5 
## 2574 1422 5728 1993 3246
date()
## [1] "Thu Oct  4 23:17:49 2018"
sce = computeSumFactors(sce, cluster=clusters, min.mean=0.1)
date()
## [1] "Thu Oct  4 23:20:07 2018"
summary(sizeFactors(sce))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.0807  0.4197  0.7436  0.9992  1.2504 11.0729
sort(sizeFactors(sce))[1:30]
##  [1] -0.0807031952 -0.0760340652 -0.0502627636 -0.0497516071 -0.0333388762
##  [6] -0.0278556079 -0.0250192491 -0.0107172974 -0.0042912613 -0.0011366402
## [11] -0.0008049468  0.0062927996  0.0119193118  0.0290307001  0.0343657339
## [16]  0.0521217305  0.0683599087  0.0695881927  0.0711287055  0.0768475650
## [21]  0.0783454542  0.0791481088  0.0796070558  0.0813334824  0.0824948106
## [26]  0.0852443823  0.0878596351  0.0908209323  0.0938742916  0.0964200771
# Remove cells with negative or very small size factors
dim(sce)
## [1] 11281 14963
sce = sce[,which(sizeFactors(sce) > 0.01)]
dim(sce)
## [1] 11281 14951
par(mfrow=c(1,2), mar=c(5,4,2,1), bty="n")
smoothScatter(sce$total_counts, sizeFactors(sce), log="xy", 
              xlab="total counts", ylab="size factors")
plot(sce$total_counts, sizeFactors(sce), log="xy", 
     xlab="total counts", ylab="size factors", 
     cex=0.3, pch=20, col=rgb(0.1,0.2,0.7,0.3))
abline(h=0.05)

dim(sce)
## [1] 11281 14951
sce = sce[,which(sizeFactors(sce) > 0.05)]
dim(sce)
## [1] 11281 14948
sce = normalize(sce) 

3.5 Dimension Reduction

For dimension reduction, such as calculating PCA or performing TSNE, we should start by identifying a subset of genes with high level of biological signal relative to background (technical) noise. The decomposeVar function from R/cran is designed for this task.

new.trend = makeTechTrend(x=sce)
fit = trendVar(sce, use.spikes=FALSE, loess.args=list(span=0.05))

par(mfrow=c(1,1), mar=c(5,4,2,1), bty="n")
plot(fit$mean, fit$var, pch=20, col=rgb(0.1,0.2,0.7,0.6), 
     xlab="log(mean)", ylab="var")
curve(fit$trend(x), col="orange", lwd=2, add=TRUE)
curve(new.trend(x), col="red", lwd=2, add=TRUE)
legend("top", legend=c("Poisson noise", "observed trend"), 
       lty=1, lwd=2, col=c("red", "orange"), bty="n")

fit$trend = new.trend
# obtain bio, FDR, pvalues for testing for HVGs (highly variable genes)
dec = decomposeVar(fit=fit) 
top.dec = dec[order(dec$bio, decreasing=TRUE),]
plotExpression(sce, features=rownames(top.dec)[1:10])

When performing PCA, we can use all the genes or just those genes with high signal-to-noise ratio. TSNE analysis is usually based on the top PCs rather than the original gene expression data. We select around top 1000 genes for the PCA and use the top 50 PCs for TSNE projection.

library(svd)
library(Rtsne)

summary(dec$bio)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.022172  0.000947  0.004376  0.011828  0.010775  5.132832
dec1 = dec
dec1$bio[which(dec$bio < 1e-5)] = 1e-5
dec1$FDR[which(dec$FDR < 1e-100)] = 1e-100

par(mfrow=c(1,2))
hist(log10(dec1$bio), breaks=100, main="")
hist(log10(dec1$FDR), breaks=100, main="")

summary(dec$FDR[dec$bio > 0.01])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.000e+00 0.000e+00 0.000e+00 6.192e-06 0.000e+00 6.215e-03
summary(dec$FDR[dec$bio > 0.03])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.000e+00 0.000e+00 0.000e+00 3.626e-11 0.000e+00 3.310e-08
table(dec$FDR < 1e-10, dec$bio > 0.03)
##        
##         FALSE TRUE
##   FALSE  4348    2
##   TRUE   6011  920
# Subsetting genes based on FDR and biological residual thresholds
w2kp = which(dec$FDR < 1e-10 & dec$bio > 0.03) # original gene filter
sce_sub = sce[w2kp,]
sce_sub
## class: SingleCellExperiment 
## dim: 920 14948 
## metadata(1): log.exprs.offset
## assays(2): counts logcounts
## rownames(920): AATK ABCA8 ... ZNF644 ZSWIM6
## rowData names(20): hgnc_symbol external_gene_name ...
##   n_cells_counts pct_dropout_counts
## colnames(14948): hHP1_AACACTATCTAC hHP1_CTACGCATCCAT ...
##   PFC-CD_CTCCATTCATGC PFC-CD_CGTCATTAGCAG
## colData names(59): is_cell_control total_features_by_counts ...
##   pct_counts_top_50_features_Ri pct_counts_top_100_features_Ri
## reducedDimNames(0):
## spikeNames(0):
# Extracting log2(norm_express+1)
edat = t(as.matrix(logcounts(sce_sub)))
edat = scale(edat)
dim(edat)
## [1] 14948   920
edat[1:2,1:3]
##                         AATK      ABCA8        A2M
## hHP1_AACACTATCTAC -0.3209666 -0.2109691 -0.1499949
## hHP1_CTACGCATCCAT  0.2404263 -0.2109691 -0.1499949
# Perform SVD on sce_sub
date()
## [1] "Thu Oct  4 23:20:52 2018"
ppk = propack.svd(edat,neig=50)
date()
## [1] "Thu Oct  4 23:21:02 2018"
pca = t(ppk$d*t(ppk$u)) # calculates pc scores aka principal components

df_pcs = data.frame(pca)
df_pcs$log10_total_features = colData(sce_sub)$log10_total_features
df_pcs$part_cell_id = sapply(colnames(sce_sub),function(xx) 
  strsplit(xx,"_")[[1]][1],USE.NAMES=FALSE)
df_pcs[1:2,1:10]
##           X1       X2       X3        X4        X5         X6        X7
## 1 -0.6397168 3.029322 2.126551 0.2110893 -1.250563 -0.8411983 0.3393688
## 2  0.1510047 3.194259 1.433259 1.3759974 -1.516416 -1.5550675 0.2770684
##          X8         X9       X10
## 1 0.3637856 -0.5986222 1.7335946
## 2 1.6677843  0.2782422 0.3932075
gp1 = ggplot(df_pcs, aes(X1,X2,col=log10_total_features)) + 
  geom_point(size=0.5,alpha=0.6) + theme_classic() + 
  scale_colour_gradient(low="lightblue",high="red") +
  guides(color = guide_legend(override.aes = list(size=3)))
gp1

gp2 = ggplot(df_pcs, aes(X1,X2,col=part_cell_id)) + 
  geom_point(size=0.5,alpha=0.6) + theme_classic() + 
  guides(color = guide_legend(override.aes = list(size=3)))
gp2

set.seed(100)
date()
## [1] "Thu Oct  4 23:21:04 2018"
tsne = Rtsne(pca, pca = FALSE)
date()
## [1] "Thu Oct  4 23:23:47 2018"
df_tsne = data.frame(tsne$Y)
df_tsne$log10_total_features = colData(sce_sub)$log10_total_features
df_tsne$part_cell_id = sapply(colnames(sce_sub),function(xx) 
  strsplit(xx,"_")[[1]][1],USE.NAMES=FALSE)
dim(df_tsne)
## [1] 14948     4
df_tsne[1:2,]
##          X1       X2 log10_total_features part_cell_id
## 1 -2.571519 8.399695             3.393048         hHP1
## 2 -2.742694 5.133562             3.453624         hHP1
gp1 = ggplot(df_tsne, aes(X1,X2,col=log10_total_features)) + 
  geom_point(size=0.3,alpha=0.6) + theme_classic() + 
  scale_colour_gradient(low="lightblue",high="red") +
  guides(color = guide_legend(override.aes = list(size=3)))
gp1

gp2 = ggplot(df_tsne, aes(X1,X2,col=part_cell_id)) + 
  geom_point(size=0.3,alpha=0.6) + theme_classic() + 
  guides(color = guide_legend(override.aes = list(size=3)))
gp2

reducedDims(sce_sub) = SimpleList(PCA=pca, TSNE=tsne$Y)
sce_sub
## class: SingleCellExperiment 
## dim: 920 14948 
## metadata(1): log.exprs.offset
## assays(2): counts logcounts
## rownames(920): AATK ABCA8 ... ZNF644 ZSWIM6
## rowData names(20): hgnc_symbol external_gene_name ...
##   n_cells_counts pct_dropout_counts
## colnames(14948): hHP1_AACACTATCTAC hHP1_CTACGCATCCAT ...
##   PFC-CD_CTCCATTCATGC PFC-CD_CGTCATTAGCAG
## colData names(59): is_cell_control total_features_by_counts ...
##   pct_counts_top_50_features_Ri pct_counts_top_100_features_Ri
## reducedDimNames(2): PCA TSNE
## spikeNames(0):

3.6 Clustering

3.6.1 Kmeans

Next we cluster the cells using a simple kmeans method on the top 50 PCs. We set the number of clusters to be 12, to match with the 12 cell types reported by (Habib et al. 2017).

k10_50_pcs = kmeans(reducedDim(sce_sub, "PCA"), 
                    centers = 12, iter.max=150, algorithm="MacQueen")
names(k10_50_pcs)
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
dim(k10_50_pcs$centers)
## [1] 12 50
df_tsne$cluster_kmean = as.factor(k10_50_pcs$cluster)

gp1 = ggplot(df_tsne, aes(X1,X2,col=cluster_kmean)) + 
  geom_point(size=0.3,alpha=0.6) + theme_classic() + 
  guides(color = guide_legend(override.aes = list(size=3)))
gp1

3.6.2 SC3

Next seek to cluster cells using another method SC3. The code used here is based on SC3 mannual. By default, when there are more than 5000 genes, SC3 will

selects a subset of cells uniformly at random (5,000), and obtains clusters from this subset. The inferred labels can be used to train a Support Vector Machine (SVM), which is employed to assign labels to the remaining cells.

By default, SC3 filter genes to select those with dropout percentage between 10 and 90%.

summary(rowData(sce)$pct_dropout_counts)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.34   93.42   96.32   94.53   97.95   99.79
table(rowData(sce)$pct_dropout_counts < 90)
## 
## FALSE  TRUE 
##  9863  1418

This will end up with 1418 genes. However, we found the clustering resutls using these 1418 genes have considerable discrepency with clustering resutls from Kmeans and cell types reported by (Habib et al. 2017). Therefore, we chose to use those genes identified from previous step for SC3.

library(SC3)
rowData(sce_sub)$feature_symbol = rowData(sce_sub)$external_gene_name
date()
## [1] "Thu Oct  4 23:23:54 2018"
all_ks = c(10,12)
sce_sub = sc3(sce_sub, gene_filter=FALSE, ks = all_ks, biology = TRUE, 
              rand_seed = 100)
date()
## [1] "Fri Oct  5 00:17:53 2018"
dim(colData(sce_sub))
## [1] 14948    63
colData(sce_sub)[1:2,1:5]
## DataFrame with 2 rows and 5 columns
##                   is_cell_control total_features_by_counts
##                         <logical>                <integer>
## hHP1_AACACTATCTAC           FALSE                     2471
## hHP1_CTACGCATCCAT           FALSE                     2841
##                   log10_total_features_by_counts total_counts
##                                        <numeric>    <integer>
## hHP1_AACACTATCTAC               3.39304846641678         4484
## hHP1_CTACGCATCCAT               3.45362407359145         4919
##                   log10_total_counts
##                            <numeric>
## hHP1_AACACTATCTAC   3.65176244738011
## hHP1_CTACGCATCCAT   3.69196510276736
names(colData(sce_sub))
##  [1] "is_cell_control"                               
##  [2] "total_features_by_counts"                      
##  [3] "log10_total_features_by_counts"                
##  [4] "total_counts"                                  
##  [5] "log10_total_counts"                            
##  [6] "pct_counts_in_top_50_features"                 
##  [7] "pct_counts_in_top_100_features"                
##  [8] "pct_counts_in_top_200_features"                
##  [9] "pct_counts_in_top_500_features"                
## [10] "total_features"                                
## [11] "log10_total_features"                          
## [12] "pct_counts_top_50_features"                    
## [13] "pct_counts_top_100_features"                   
## [14] "pct_counts_top_200_features"                   
## [15] "pct_counts_top_500_features"                   
## [16] "total_features_by_counts_endogenous"           
## [17] "log10_total_features_by_counts_endogenous"     
## [18] "total_counts_endogenous"                       
## [19] "log10_total_counts_endogenous"                 
## [20] "pct_counts_endogenous"                         
## [21] "pct_counts_in_top_50_features_endogenous"      
## [22] "pct_counts_in_top_100_features_endogenous"     
## [23] "pct_counts_in_top_200_features_endogenous"     
## [24] "pct_counts_in_top_500_features_endogenous"     
## [25] "total_features_endogenous"                     
## [26] "log10_total_features_endogenous"               
## [27] "pct_counts_top_50_features_endogenous"         
## [28] "pct_counts_top_100_features_endogenous"        
## [29] "pct_counts_top_200_features_endogenous"        
## [30] "pct_counts_top_500_features_endogenous"        
## [31] "total_features_by_counts_feature_control"      
## [32] "log10_total_features_by_counts_feature_control"
## [33] "total_counts_feature_control"                  
## [34] "log10_total_counts_feature_control"            
## [35] "pct_counts_feature_control"                    
## [36] "pct_counts_in_top_50_features_feature_control" 
## [37] "pct_counts_in_top_100_features_feature_control"
## [38] "total_features_feature_control"                
## [39] "log10_total_features_feature_control"          
## [40] "pct_counts_top_50_features_feature_control"    
## [41] "pct_counts_top_100_features_feature_control"   
## [42] "total_features_by_counts_Mt"                   
## [43] "log10_total_features_by_counts_Mt"             
## [44] "total_counts_Mt"                               
## [45] "log10_total_counts_Mt"                         
## [46] "pct_counts_Mt"                                 
## [47] "total_features_Mt"                             
## [48] "log10_total_features_Mt"                       
## [49] "total_features_by_counts_Ri"                   
## [50] "log10_total_features_by_counts_Ri"             
## [51] "total_counts_Ri"                               
## [52] "log10_total_counts_Ri"                         
## [53] "pct_counts_Ri"                                 
## [54] "pct_counts_in_top_50_features_Ri"              
## [55] "pct_counts_in_top_100_features_Ri"             
## [56] "total_features_Ri"                             
## [57] "log10_total_features_Ri"                       
## [58] "pct_counts_top_50_features_Ri"                 
## [59] "pct_counts_top_100_features_Ri"                
## [60] "sc3_10_clusters"                               
## [61] "sc3_12_clusters"                               
## [62] "sc3_10_log2_outlier_score"                     
## [63] "sc3_12_log2_outlier_score"
table(colData(sce_sub)$sc3_12_clusters, useNA="ifany")
## 
##    1    2    3    4    5    6    7    8    9   10   11   12 <NA> 
##  644  678  206  703 1043  292  263   43  597   27  402  102 9948
table(colData(sce_sub)$sc3_10_clusters, useNA="ifany")
## 
##    1    2    3    4    5    6    7    8    9   10 <NA> 
## 1394  241  605  499   21    4 1054  643   73  466 9948
# Run SVM and predict labels of all other cells
date()
## [1] "Fri Oct  5 00:17:53 2018"
sce_sub = sc3_run_svm(sce_sub, ks = all_ks)
date()
## [1] "Fri Oct  5 00:20:14 2018"
table(colData(sce_sub)$sc3_12_clusters, useNA="ifany")
## 
##    1    2    3    4    5    6    7    8    9   10   11   12 
## 1938  687  988  879 2358  742 1590 2322 1209  517  432 1286
table(colData(sce_sub)$sc3_10_clusters, useNA="ifany")
## 
##    1    2    3    4    5    6    7    8    9   10 
## 4376 1179 1034 1683  980   13 1055 2714 1340  574
sum(table(colData(sce_sub)$sc3_12_clusters))
## [1] 14948
sum(table(colData(sce_sub)$sc3_10_clusters))
## [1] 14948

3.7 Compare our kmeans to SC3 and to DrocNc clustering

We obtainted the cell type and clustering resutls from (Habib et al. 2017). Supplementary Table 10: supplement nmeth.4407-S10.xlsx file. Here we compare the cell type reported by Habib et al. (2017) with ours. Habib et al. (2017) identified genes with high variation by fitting a gamma distribution on the relation between mean and coefficient of variation and chose the number of PCs based on “the largest eigen value gap”. It was not clear what is the number of PCs used. Then they used these top PCs to perform tSNE anlaysis and clustering using a graph based method.

source(file.path(src_dir,"SOURCE.R"))

tmp_lab = smart_RT(file.path(src_dir, "cluster_num_label.txt"), 
                   sep = "\t",header = TRUE)
tmp_lab = name_change(tmp_lab, "Name", "Cluster.Name")
tmp_lab = name_change(tmp_lab, "Name.1", "Cell_Type")

dim(tmp_lab)
## [1] 15  4
tmp_lab[1:2,]
##   Cell.ID Cluster.Name Cell.Type.ID Cell_Type
## 1       1       exPFC1            1     exPFC
## 2       2       exPFC2            1     exPFC
tmp_res = smart_RT(file.path(src_dir, "paper_cluster_res.txt"), 
                   sep = "\t", header = TRUE, comment.char = "")
tmp_res = name_change(tmp_res, "X.Genes", "nGenes")
tmp_res = name_change(tmp_res, "X.Transcripts", "nTranscripts")

tmp_res = smart_merge(tmp_res, tmp_lab[,c("Cluster.Name","Cell_Type")], 
                      all.x=TRUE)
dim(tmp_res)
## [1] 14963     6
tmp_res[1:2,]
##   Cluster.Name          Cell.ID nGenes nTranscripts Cluster.ID Cell_Type
## 1         ASC1 hCd_TCCTTCCGTAAA    334          515          8       ASC
## 2         ASC1 hCd_ACGCCCAGGGCG    465          608          8       ASC
summary(tmp_res$nGenes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   201.0   337.0   529.0   658.4   826.0  4249.0
hist(tmp_res$nGenes, breaks=50, col="gray", main="", 
     xlab="number of genes per cell")

df_tsne$Cell.ID = colnames(sce_sub)

# Merge and compare
all_clust_res = smart_merge(df_tsne, tmp_res)
sc3_res = smart_df(colData(sce_sub)[,c("sc3_12_clusters", "sc3_10_clusters")])
sc3_res$Cell.ID = colnames(sce_sub)
all_clust_res = smart_merge(all_clust_res, sc3_res)
dim(all_clust_res)
## [1] 14948    13
all_clust_res[1:5,]
##            Cell.ID        X1          X2 log10_total_features part_cell_id
## 1 hCc_AAAAAGCTACAA 10.451712 -10.5185553             2.866287          hCc
## 2 hCc_AAACAGGTGAGG 25.893491   1.3019993             3.170262          hCc
## 3 hCc_AAACCCTTTACA 23.711009  -0.9235069             3.044540          hCc
## 4 hCc_AAACGTGACGGA 20.799815  -2.1608760             2.849419          hCc
## 5 hCc_AAAGAACTCGCG  2.945587  -3.9327487             2.716838          hCc
##   cluster_kmean Cluster.Name nGenes nTranscripts Cluster.ID Cell_Type
## 1            10       exPFC1    736         1034          1     exPFC
## 2             3        GABA1   1480         2391          5      GABA
## 3            10        GABA1   1107         1757          5      GABA
## 4            10        GABA1    708          935          5      GABA
## 5            10       exPFC1    520          673          1     exPFC
##   sc3_12_clusters sc3_10_clusters
## 1               5               1
## 2               8               1
## 3               5               1
## 4               8               1
## 5               5               1
smart_table(all_clust_res[,c("Cell_Type","Cluster.ID")])
##          Cluster.ID
## Cell_Type    1    2    3    4    5    6    7    8    9   10   11   12   13
##     ASC      0    0    0    0    0    0    0 1204  705    0    0    0    0
##     END      0    0    0    0    0    0    0    0    0    0    0    0    0
##     exCA1    0    0  421    0    0    0    0    0    0    0    0    0    0
##     exCA3    0    0    0  745    0    0    0    0    0    0    0    0    0
##     exDG     0    0    0    0    0    0 1453    0    0    0    0    0    0
##     exPFC 3104  297    0    0    0    0    0    0    0    0    0    0    0
##     GABA     0    0    0    0  892  823    0    0    0    0    0    0    0
##     MG       0    0    0    0    0    0    0    0    0    0    0    0  389
##     NSC      0    0    0    0    0    0    0    0    0    0    0    0    0
##     ODC      0    0    0    0    0    0    0    0    0 1718 1247    0    0
##     OPC      0    0    0    0    0    0    0    0    0    0    0  684    0
##     <NA>     0    0    0    0    0    0    0    0    0    0    0    0    0
##          Cluster.ID
## Cell_Type   14   15   16   17   18
##     ASC      0    0    0    0    0
##     END      0  254    0    0    0
##     exCA1    0    0    0    0    0
##     exCA3    0    0    0    0    0
##     exDG     0    0    0    0    0
##     exPFC    0    0    0    0    0
##     GABA     0    0    0    0    0
##     MG       0    0    0    0    0
##     NSC    201    0    0    0    0
##     ODC      0    0    0    0    0
##     OPC      0    0    0    0    0
##     <NA>     0    0  200  123  488
smart_table(all_clust_res[,c("Cell_Type","cluster_kmean")])
##          cluster_kmean
## Cell_Type    1    2    3    4    5    6    7    8    9   10   11   12
##     ASC      3    5    6    1   27 1790    3   28    0   35   11    0
##     END      0    0    0  237    3    3    1    1    0    9    0    0
##     exCA1    0    2    5    0   10    6   20   71    0   88  219    0
##     exCA3    0    1    4    0    4    0    4  545    0   94   93    0
##     exDG     0    0    0    0    3    1    2   86    0   16 1345    0
##     exPFC    6    9   16    5   67   51    9  378    0 2817   43    0
##     GABA     1    1 1045    1    3    2   47   99    0  505   11    0
##     MG       3    6    3  279   25    8    1   11    0   39   14    0
##     NSC      0    0    0    0    0    8    0    1  180    0   12    0
##     ODC      6 1093    3    0 1814    6    1    6    0   24   12    0
##     OPC    631    4    0    0    6    3    1   15    0   20    4    0
##     <NA>     5   11   13   10   42   15  266   20    1   14  266  148
smart_table(all_clust_res[,c("Cell_Type","sc3_12_clusters")])
##          sc3_12_clusters
## Cell_Type    1    2    3    4    5    6    7    8    9   10   11   12
##     ASC   1808    4    6   10   13    3   14   23   10    3    6    9
##     END      3    1    0   81    3    0  155    3    3    3    0    2
##     exCA1    7    3    6    4   32   86    8   82  158    5   12   18
##     exCA3    1    1    5    0   55  181    2   95  395    4    3    3
##     exDG     1    0    1    0    9    4    1   22  472    1    4  938
##     exPFC   56   21  582   21  750   11   26 1522   44   12  323   33
##     GABA     4    2   59  154  205    5  232  491    5  446   23   89
##     MG      11    4    3  117   13    2  212   12    6    6    0    3
##     NSC      8    0    0   74    0    0  113    0    1    2    0    3
##     ODC     10  630    7  347 1238    4  693   15    3    6    4    8
##     OPC      6    5  201    1    8  432    3   13    2    9    1    3
##     <NA>    23   16  118   70   32   14  131   44  110   20   56  177
smart_table(all_clust_res[,c("Cell_Type","sc3_10_clusters")])
##          sc3_10_clusters
## Cell_Type    1    2    3    4    5    6    7    8    9   10
##     ASC     37   15   11   15    9    3   12  628 1179    0
##     END      5  151    1    1    4    1    1    5    4   81
##     exCA1  345    5   17   24   11    1    5    7    6    0
##     exCA3  711    4   11   11    5    0    1    2    0    0
##     exDG    56    7  466  918    1    0    0    4    1    0
##     exPFC 2268   26   30  354  587    1   32   57   40    6
##     GABA   827  187  355  114  221    1    1    4    5    0
##     MG      14  205    8    2    6    1    8   24   11  110
##     NSC      1  113    2    3    2    0    0    0    5   75
##     ODC     14    2    8   10   12    3  972 1940    3    1
##     OPC     14  430    7    4    6    1    2    9   14  197
##     <NA>    84   34  118  227  116    1   21   34   72  104
t2 = melt(smart_table(all_clust_res[,c("Cell_Type","cluster_kmean")]))
colnames(t2)[2] = "cluster"
summary(t2$value)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     4.0   103.8    20.0  2817.0
gg.heatmap(t2, "kmeans 12 clusters")

t2 = melt(smart_table(all_clust_res[,c("Cell_Type","sc3_12_clusters")]))
colnames(t2)[2] = "cluster"
summary(t2$value)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    3.00    8.50  103.81   61.75 1808.00
gg.heatmap(t2, "sc3 12 clusters")

t2 = melt(smart_table(all_clust_res[,c("Cell_Type","sc3_10_clusters")]))
colnames(t2)[2] = "cluster"
summary(t2$value)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    2.00    8.00  124.57   60.75 2268.00
gg.heatmap(t2, "sc3 10 clusters")

We plot our TSNE colored by our clustering results.

gp1 = ggplot(all_clust_res, aes(X1,X2,col=Cell_Type)) + 
  geom_point(size=0.2,alpha=0.6) + theme_classic() + 
  guides(color = guide_legend(override.aes = list(size=3))) + 
  ggtitle("cell type")
gp1

gp1 = ggplot(all_clust_res, aes(X1,X2,col=cluster_kmean)) + 
  geom_point(size=0.2,alpha=0.6) + theme_classic() + 
  guides(color = guide_legend(override.aes = list(size=3))) +
  ggtitle("kmeans 12 clusters")
gp1

gp1 = ggplot(all_clust_res, aes(X1,X2,col=factor(sc3_10_clusters))) + 
  geom_point(size=0.2,alpha=0.6) + theme_classic() + 
  guides(color = guide_legend(override.aes = list(size=3))) + 
  ggtitle("sc3 10 clusters")
gp1

gp1 = ggplot(all_clust_res, aes(X1,X2,col=factor(sc3_12_clusters))) + 
  geom_point(size=0.2,alpha=0.6) + theme_classic() + 
  guides(color = guide_legend(override.aes = list(size=3))) + 
  ggtitle("sc3 12 clusters")
gp1

Next we plot the TSNE by Habib et al. (2017). and colored by our clustering results.

paper_tsne = smart_RT(file.path(dronc_dir, "GTEx_droncseq_hip_pcf.tsne.txt"), 
                      sep='\t',header=TRUE,row.names=1)
paper_tsne$Cell.ID = rownames(paper_tsne)
rownames(paper_tsne) = NULL
all_clust_res = smart_merge(all_clust_res,paper_tsne)

gp1 = ggplot(all_clust_res, aes(tSNE_1,tSNE_2,col=Cell_Type)) + 
  geom_point(size=0.2,alpha=0.6) + theme_classic() + 
  guides(color = guide_legend(override.aes = list(size=3))) + 
  ggtitle("cell types")
gp1

gp1 = ggplot(all_clust_res, aes(tSNE_1,tSNE_2,col=cluster_kmean)) + 
  geom_point(size=0.2,alpha=0.6) + theme_classic() + 
  guides(color = guide_legend(override.aes = list(size=3))) + 
  ggtitle("kmeans 12 clusters")
gp1

gp1 = ggplot(all_clust_res, aes(tSNE_1,tSNE_2,col=factor(sc3_10_clusters))) + 
  geom_point(size=0.2,alpha=0.6) + theme_classic() + 
  guides(color = guide_legend(override.aes = list(size=3))) +
  ggtitle("sc3 10 clusters")
gp1

gp1 = ggplot(all_clust_res, aes(tSNE_1,tSNE_2,col=factor(sc3_12_clusters))) + 
  geom_point(size=0.2,alpha=0.6) + theme_classic() + 
  guides(color = guide_legend(override.aes = list(size=3))) + 
  ggtitle("sc3 12 clusters")
gp1

Finally we save the sce object and the clustering results

sce_fn = file.path(dronc_dir, "sce.rds")
saveRDS(sce, sce_fn)

sce_sub_fn = file.path(dronc_dir, "sce_sub.rds")
saveRDS(sce_sub, sce_sub_fn)

all_clust_res_fn = file.path(dronc_dir, "all_clust_res.rds")
saveRDS(all_clust_res, all_clust_res_fn)

4 Session information

sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] SC3_1.8.0                   Rtsne_0.13                 
##  [3] svd_0.4.1                   limma_3.36.5               
##  [5] scran_1.8.4                 scater_1.8.4               
##  [7] ggplot2_3.0.0               biomaRt_2.36.1             
##  [9] DropletUtils_1.0.3          SingleCellExperiment_1.2.0 
## [11] SummarizedExperiment_1.10.1 DelayedArray_0.6.4         
## [13] BiocParallel_1.14.2         matrixStats_0.54.0         
## [15] Biobase_2.40.0              GenomicRanges_1.32.6       
## [17] GenomeInfoDb_1.16.0         IRanges_2.14.10            
## [19] S4Vectors_0.18.3            BiocGenerics_0.26.0        
## [21] data.table_1.11.4          
## 
## loaded via a namespace (and not attached):
##   [1] ggbeeswarm_0.6.0         colorspace_1.3-2        
##   [3] rjson_0.2.20             class_7.3-14            
##   [5] dynamicTreeCut_1.63-1    rprojroot_1.3-2         
##   [7] XVector_0.20.0           DT_0.4                  
##   [9] bit64_0.9-7              mvtnorm_1.0-8           
##  [11] AnnotationDbi_1.42.1     codetools_0.2-15        
##  [13] tximport_1.8.0           doParallel_1.0.11       
##  [15] robustbase_0.93-2        knitr_1.20              
##  [17] cluster_2.0.7-1          pheatmap_1.0.10         
##  [19] shinydashboard_0.7.0     shiny_1.1.0             
##  [21] rrcov_1.4-4              compiler_3.5.0          
##  [23] httr_1.3.1               backports_1.1.2         
##  [25] assertthat_0.2.0         Matrix_1.2-14           
##  [27] lazyeval_0.2.1           later_0.7.3             
##  [29] htmltools_0.3.6          prettyunits_1.0.2       
##  [31] tools_3.5.0              bindrcpp_0.2.2          
##  [33] igraph_1.2.2             gtable_0.2.0            
##  [35] glue_1.3.0               GenomeInfoDbData_1.1.0  
##  [37] reshape2_1.4.3           dplyr_0.7.6             
##  [39] doRNG_1.7.1              Rcpp_0.12.18            
##  [41] gdata_2.18.0             iterators_1.0.10        
##  [43] DelayedMatrixStats_1.2.0 stringr_1.3.1           
##  [45] mime_0.5                 irlba_2.3.2             
##  [47] rngtools_1.3.1           gtools_3.8.1            
##  [49] WriteXLS_4.0.0           statmod_1.4.30          
##  [51] XML_3.98-1.15            DEoptimR_1.0-8          
##  [53] edgeR_3.22.3             zlibbioc_1.26.0         
##  [55] scales_1.0.0             hms_0.4.2               
##  [57] promises_1.0.1           rhdf5_2.24.0            
##  [59] RColorBrewer_1.1-2       yaml_2.2.0              
##  [61] memoise_1.1.0            gridExtra_2.3           
##  [63] pkgmaker_0.27            stringi_1.2.4           
##  [65] RSQLite_2.1.1            pcaPP_1.9-73            
##  [67] foreach_1.4.4            e1071_1.7-0             
##  [69] caTools_1.17.1.1         bibtex_0.4.2            
##  [71] rlang_0.2.1              pkgconfig_2.0.1         
##  [73] bitops_1.0-6             evaluate_0.11           
##  [75] lattice_0.20-35          ROCR_1.0-7              
##  [77] purrr_0.2.5              Rhdf5lib_1.2.1          
##  [79] bindr_0.1.1              htmlwidgets_1.2         
##  [81] labeling_0.3             cowplot_0.9.3           
##  [83] bit_1.1-14               tidyselect_0.2.4        
##  [85] plyr_1.8.4               magrittr_1.5            
##  [87] R6_2.2.2                 gplots_3.0.1            
##  [89] DBI_1.0.0                pillar_1.3.0            
##  [91] withr_2.1.2              RCurl_1.95-4.11         
##  [93] tibble_1.4.2             crayon_1.3.4            
##  [95] KernSmooth_2.23-15       rmarkdown_1.10          
##  [97] viridis_0.5.1            progress_1.2.0          
##  [99] locfit_1.5-9.1           grid_3.5.0              
## [101] blob_1.1.1               FNN_1.1.2.1             
## [103] digest_0.6.15            xtable_1.8-2            
## [105] httpuv_1.4.5             munsell_0.5.0           
## [107] registry_0.5             beeswarm_0.2.3          
## [109] viridisLite_0.3.0        vipor_0.4.5

Reference

Habib, Naomi, Inbal Avraham-Davidi, Anindita Basu, Tyler Burks, Karthik Shekhar, Matan Hofree, Sourav R Choudhury, et al. 2017. “Massively Parallel Single-Nucleus Rna-Seq with Dronc-Seq.” Nature Methods 14 (10). Nature Publishing Group: 955.

Lun, Aaron TL, Karsten Bach, and John C Marioni. 2016. “Pooling Across Cells to Normalize Single-Cell Rna Sequencing Data with Many Zero Counts.” Genome Biology 17 (1). BioMed Central: 75.